Now that you’ve demonstrated your software is setup, and you have the basics of data manipulation, the goal of this assignment is to practice transforming, visualising, and exploring data.
In July 2012, in the aftermath of a mass shooting in a movie theater in Aurora, Colorado, Mother Jones published a report on mass shootings in the United States since 1982. Importantly, they provided the underlying data set as an open-source database for anyone interested in studying and understanding this criminal behavior.
## Rows: 125
## Columns: 14
## $ case <chr> "Oxford High School shooting", "San Jose VTA shoo…
## $ year <dbl> 2021, 2021, 2021, 2021, 2021, 2021, 2020, 2020, 2…
## $ month <chr> "Nov", "May", "Apr", "Mar", "Mar", "Mar", "Mar", …
## $ day <dbl> 30, 26, 15, 31, 22, 16, 16, 26, 10, 6, 31, 4, 3, …
## $ location <chr> "Oxford, Michigan", "San Jose, California", "Indi…
## $ summary <chr> "Ethan Crumbley, a 15-year-old student at Oxford …
## $ fatalities <dbl> 4, 9, 8, 4, 10, 8, 4, 5, 4, 3, 7, 9, 22, 3, 12, 5…
## $ injured <dbl> 7, 0, 7, 1, 0, 1, 0, 0, 3, 8, 25, 27, 26, 12, 4, …
## $ total_victims <dbl> 11, 9, 15, 5, 10, 9, 4, 5, 7, 11, 32, 36, 48, 15,…
## $ location_type <chr> "School", "Workplace", "Workplace", "Workplace", …
## $ male <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
## $ age_of_shooter <dbl> 15, 57, 19, NA, 21, 21, 31, 51, NA, NA, 36, 24, 2…
## $ race <chr> NA, NA, "White", NA, NA, "White", NA, "Black", "B…
## $ prior_mental_illness <chr> NA, "Yes", "Yes", NA, "Yes", NA, NA, NA, NA, NA, …
| column(variable) | description |
|---|---|
| case | short name of incident |
| year, month, day | year, month, day in which the shooting occurred |
| location | city and state where the shooting occcurred |
| summary | brief description of the incident |
| fatalities | Number of fatalities in the incident, excluding the shooter |
| injured | Number of injured, non-fatal victims in the incident, excluding the shooter |
| total_victims | number of total victims in the incident, excluding the shooter |
| location_type | generic location in which the shooting took place |
| male | logical value, indicating whether the shooter was male |
| age_of_shooter | age of the shooter when the incident occured |
| race | race of the shooter |
| prior_mental_illness | did the shooter show evidence of mental illness prior to the incident? |
mass_shootings %>%
group_by(year) %>%
summarise(count = n()) %>%
arrange(count)## # A tibble: 37 × 2
## year count
## <dbl> <int>
## 1 1982 1
## 2 1986 1
## 3 1987 1
## 4 1988 1
## 5 1990 1
## 6 1994 1
## 7 1995 1
## 8 1996 1
## 9 2000 1
## 10 2001 1
## # ℹ 27 more rows
mass_shootings %>%
mutate(race = ifelse(is.na(race), "Unknown", race)) %>%
filter(!is.na(race)) %>%
group_by(race) %>%
summarise(race_count = n()) %>%
mutate(race = fct_rev(fct_reorder(race, race_count, max))) %>%
ggplot(aes(x =race, y=race_count))+
geom_bar(position = 'dodge', stat='identity')+
theme_minimal(base_size=6) +
geom_text(aes(label=race_count), position=position_dodge(width=0.9), vjust=-0.25) +
labs(x = "Race") +
ggtitle("Mass shooters in the US are predominantly white") +
ggthemes::theme_economist()mass_shootings %>%
ggplot(aes(x = location_type, y = total_victims)) +
geom_boxplot() +
labs(x = "Location Type", y = "Total Victim Count") +
ggtitle("Most Mass Shootings have between 5-30 victims")mass_shootings %>%
filter(case != 'Las Vegas Strip massacre') %>%
ggplot(aes(x = location_type, y = total_victims)) +
geom_boxplot() +
labs(x = "Location Type", y = "Total Victim Count") +
ggtitle("Most Mass Shootings have between 5-30 victims")Address the following questions. Generate appropriate figures/tables to support your conclusions.
mass_shootings %>%
mutate(race = ifelse(is.na(race), "Unknown", race)) %>%
mutate(prior_mental_illness = ifelse(is.na(prior_mental_illness), "Unknown", prior_mental_illness)) %>%
filter(male & race == 'White' & year > 2000) %>%
group_by(prior_mental_illness) %>%
summarise(pmi_count = n()) %>%
mutate(prior_mental_illness = fct_rev(fct_reorder(prior_mental_illness, pmi_count, max))) %>%
ggplot(aes(x =prior_mental_illness, y=pmi_count))+
geom_bar(position = 'dodge', stat='identity')+
theme_minimal(base_size=6) +
geom_text(aes(label=pmi_count), position=position_dodge(width=0.9), vjust=-0.25) +
labs(x = "Prior Mental Illness", y='') +
ggtitle("Most White Male Mass-Shooters display signs of prior mental illness") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14)) # Adjust the size value as neededmass_shootings %>%
group_by(month) %>%
summarise(month_count = n()) %>%
mutate(month_num = as.numeric(match(month, month.abb))) %>%
mutate(month = fct_reorder(month, month_num, max)) %>%
ggplot(aes(x =month, y=month_count))+
geom_bar(position = 'dodge', stat='identity')+
theme_minimal(base_size=6) +
geom_text(aes(label=month_count), position=position_dodge(width=0.9), vjust=-0.25) +
labs(x = "Month", y='', subtitle = "Less shooting happen in summer months") +
ggtitle('Most shootings happen in Feb, the shortest month') +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14)) # Adjust the size value as neededmass_shootings %>%
filter(race %in% c('White', 'Black')) %>%
ggplot(aes(x = fatalities, fill = race)) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = c("blue", "red"),
guide = guide_legend(override.aes = list(alpha = 1))) +
labs(x = "Fatalities", y='Freqiency', subtitle = "") +
ggtitle('Shootings perpretrated by black mass shooters tend to have fewer fatalities') +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 10)) # Adjust the size value as neededmass_shootings %>%
filter(race %in% c('White', 'Latino')) %>%
ggplot(aes(x = fatalities, fill = race)) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = c("blue", "green"),
guide = guide_legend(override.aes = list(alpha = 1))) +
labs(x = "Fatalities", y='Freqiency', subtitle = "") +
ggtitle('Similarly with Latino') +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 10)) # Adjust the size value as neededmass_shootings %>%
mutate(fatality_ratio = fatalities/total_victims) %>%
ggplot(aes(x = prior_mental_illness, y = fatality_ratio)) +
geom_boxplot() +
labs(title = "Fatality ratios (Fatalities/Total Victims) are more concentrated\namongst those without prior mental illness sign",
x = "Prior Mental Illness",
y = "Fatality Ratio",
subtitle = "Suggests that those without prior mental illness signs\ncould have a more methodological approach to their crime") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14)) # Adjust the size value as neededmass_shootings %>%
mutate(race = fct_infreq(race)) %>%
group_by(race, prior_mental_illness) %>%
summarise(avg_age = mean(age_of_shooter, na.rm = TRUE)) %>%
ggplot(aes(x = race, y = avg_age, fill = prior_mental_illness)) +
geom_tile(stat = "identity", position = "dodge") +
labs(title = "For black mass shooters, those with prior mental health\nillness signs tend to be younger than those without",
x = "Race",
y = "Average Age",
subtitle = "White mass shooters tend to be a similar age\nregardless of whether they have had prior mental illness signs") +
scale_fill_manual(values = c("blue", "red", "gray"),
limits = c("Yes", "No", NA),
labels = c("Yes", "No", "Unknown")) +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14)) # Adjust the size value as neededmass_shootings %>%
ggplot(aes(x = prior_mental_illness, y = total_victims)) +
coord_cartesian(ylim = c(quantile(mass_shootings$total_victims, 0.05), quantile(mass_shootings$total_victims, 0.95))) +
geom_boxplot() +
labs(title = "Total victims tend to be higher for shooters with\nprior mental illness signs",
x = "Prior Mental Illness",
y = "Total Victim Count",
subtitle = "Total victim count also tends to be more volatile") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14)) # Adjust the size value as neededmass_shootings %>%
mutate(location_type = fct_infreq(location_type)) %>%
group_by(prior_mental_illness, location_type) %>%
summarize(count = n()) %>%
ggplot(aes(x = location_type, y = count, fill = prior_mental_illness)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Workspaces and Schools are common location_types for\n mass shooters with and without mental illness signs",
x = "Location Type",
y = "Frequency",
subtitle = "Mass shooters who chose military, reglious or airports\nall had prior signs of mental illness") +
scale_fill_manual(values = c("blue", "red", "gray"),
limits = c("Yes", "No", NA),
labels = c("Yes", "No", "Unknown")) +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14)) # Adjust the size value as neededmass_shootings %>%
mutate(location_type = ifelse(is.na(location_type), "Unknown", location_type)) %>%
# Restrict Location Type to only workplace school or other
mutate(location_type = case_when(
location_type %in% c("Workplace", "School") ~ as.character(location_type),
TRUE ~ "Other"
)) %>%
mutate(location_type = fct_rev(fct_infreq(location_type))) %>%
group_by(prior_mental_illness, location_type) %>%
summarize(avg_victim_count = median(total_victims, na.rm = TRUE)) %>%
ggplot(aes(x = prior_mental_illness , y = avg_victim_count, fill = location_type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Mass shootings at schools have the highest median victim counts",
x = "Prior Mental Illness",
y = "Total Victims (Median)",
subtitle = "This trend is more pronounced amongst those with prior mental health illness") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14)) # Adjust the size value as neededIt is clear that those with prior signs of mental illness have a more volatility in total victim and fatality ratios in their shootings. This may be because the crimes are less pre-planned.
Workplaces and schools tend to be the most populated locations for mass shooters regardless of their mental health illness status. Interestingly, mass shooters who have committed their crimes in Military or Airport facilities all had prior mental illness signs. This is probably because people working in these in these locations would have to go through more background checks.
Despite schools being less common mass shooting locations than workplaces or others, they tend to have a higher median number of total victims. This discrepancy tends to be higher in shooters with prior mental health illness.
In general, black mass shooters tend to be older than white mass shooters. For white mass shooters, those with prior mental illness tend to be older than those without. For black and Latino mass shooters, those with prior mental illness tend to be younger
Make sure to provide a couple of sentences of written interpretation of your tables/figures. Graphs and tables alone will not be sufficient to answer this question.
We will be using a dataset with credit card transactions containing legitimate and fraud transactions. Fraud is typically well below 1% of all transactions, so a naive model that predicts that all transactions are legitimate and not fraudulent would have an accuracy of well over 99%– pretty good, no? (well, not quite as we will see later in the course)
You can read more on credit card fraud on Credit Card Fraud Detection Using Weighted Support Vector Machine
The dataset we will use consists of credit card transactions and it includes information about each transaction including customer details, the merchant and category of purchase, and whether or not the transaction was a fraud.
The dataset is too large to be hosted on Canvas or Github, so please download it from dropbox https://www.dropbox.com/sh/q1yk8mmnbbrzavl/AAAxzRtIhag9Nc_hODafGV2ka?dl=0 and save it in your dsb repo, under the data folder
## Rows: 671,028
## Columns: 14
## $ trans_date_trans_time <dttm> 2019-02-22 07:32:58, 2019-02-16 15:07:20, 2019-…
## $ trans_year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2020, …
## $ category <chr> "entertainment", "kids_pets", "personal_care", "…
## $ amt <dbl> 7.79, 3.89, 8.43, 40.00, 54.04, 95.61, 64.95, 3.…
## $ city <chr> "Veedersburg", "Holloway", "Arnold", "Apison", "…
## $ state <chr> "IN", "OH", "MO", "TN", "CO", "GA", "MN", "AL", …
## $ lat <dbl> 40.1186, 40.0113, 38.4305, 35.0149, 39.4584, 32.…
## $ long <dbl> -87.2602, -80.9701, -90.3870, -85.0164, -106.385…
## $ city_pop <dbl> 4049, 128, 35439, 3730, 277, 1841, 136, 190178, …
## $ job <chr> "Development worker, community", "Child psychoth…
## $ dob <date> 1959-10-19, 1946-04-03, 1985-03-31, 1991-01-28,…
## $ merch_lat <dbl> 39.41679, 39.74585, 37.73078, 34.53277, 39.95244…
## $ merch_long <dbl> -87.52619, -81.52477, -91.36875, -84.10676, -106…
## $ is_fraud <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
The data dictionary is as follows
| column(variable) | description |
|---|---|
| trans_date_trans_time | Transaction DateTime |
| trans_year | Transaction year |
| category | category of merchant |
| amt | amount of transaction |
| city | City of card holder |
| state | State of card holder |
| lat | Latitude location of purchase |
| long | Longitude location of purchase |
| city_pop | card holder’s city population |
| job | job of card holder |
| dob | date of birth of card holder |
| merch_lat | Latitude Location of Merchant |
| merch_long | Longitude Location of Merchant |
| is_fraud | Whether Transaction is Fraud (1) or Not (0) |
card_fraud %>%
group_by(trans_year, is_fraud) %>%
count() %>%
group_by(trans_year) %>%
mutate(Percentage = n / sum(n) * 100) %>%
filter(is_fraud == 1)## # A tibble: 2 × 4
## # Groups: trans_year [2]
## trans_year is_fraud n Percentage
## <dbl> <dbl> <int> <dbl>
## 1 2019 1 2721 0.568
## 2 2020 1 1215 0.632
## 0.57% were fraud in 2019 (2721 Transactions)
## 0.63% in 2020 (1215 Transactions)card_fraud %>%
group_by(trans_year, is_fraud) %>%
summarise(total_usd_amt = sum(amt)) %>%
group_by(trans_year) %>%
mutate(pct_total_usd_amt = total_usd_amt / sum(total_usd_amt) * 100) %>%
filter(is_fraud == 1)## # A tibble: 2 × 4
## # Groups: trans_year [2]
## trans_year is_fraud total_usd_amt pct_total_usd_amt
## <dbl> <dbl> <dbl> <dbl>
## 1 2019 1 1423140. 4.23
## 2 2020 1 651949. 4.80
## In terms of total USD amount, fraudulent transactions represent
## a higher proportion of total transactions 4.2% in 2019, 4.8% in 2020card_fraud %>%
mutate(is_fraud = ifelse(is_fraud == 1, "Yes", "No")) %>%
ggplot(aes(x = amt, fill = is_fraud)) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = c("blue", "green"),
guide = guide_legend(override.aes = list(alpha = 1))) +
labs(x = "Amount USD", y='Density', subtitle = "") +
coord_cartesian(xlim = c(quantile(card_fraud$amt, 0.00), quantile(card_fraud$amt, 0.99))) +
ggtitle('Similarly with Latino') +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 10)) + # Adjust the size value as needed
scale_x_continuous(breaks = round(seq(min(card_fraud$amt), quantile(card_fraud$amt, 0.99), length.out = 20), digits = 0)
)num_fraud <- card_fraud %>%
filter(is_fraud == 1) %>%
count() %>%
pull(n)
card_fraud %>%
filter(is_fraud == 1 & !is.na(category)) %>%
mutate(category = fct_infreq(category)) %>%
group_by(category) %>%
summarize(pct_fraud_transactions = n() / num_fraud * 100) %>%
ggplot(aes(x = category, y = pct_fraud_transactions)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Groceries ands Online Shopping are\nthe most common fraudulent transactions categories",
x = "",
y = "Percentage of Fradulent Transactions",
subtitle = "") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.85)) # Adjust the size value as neededlubridate package and the following codecard_fraud_date <- card_fraud %>%
mutate(
date_only = lubridate::date(trans_date_trans_time),
month_name = lubridate::month(trans_date_trans_time, label=TRUE),
month_number = month(date(trans_date_trans_time)),
hour = lubridate::hour(trans_date_trans_time),
weekday = lubridate::wday(trans_date_trans_time, label = TRUE),
weekday_num = lubridate::wday(trans_date_trans_time, label = FALSE)
)
card_fraud_date %>%
filter(is_fraud == 1 & !is.na(month_name)) %>%
mutate(month_name = fct_reorder(month_name, month_number, max)) %>%
group_by(month_name, month_number) %>%
summarize(pct_fraud_transactions = n() / num_fraud * 100) %>%
ggplot(aes(x = month_name, y = pct_fraud_transactions)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Fraud is more likely to occur in the first half of the year than the second",
x = "",
y = "Percentage of Fradulent Transactions",
subtitle = "With march and may having\nhighest percentage of fraudulent transactions") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.85))card_fraud_date %>%
filter(is_fraud == 1 & !is.na(hour)) %>%
arrange(hour) %>%
group_by(hour) %>%
summarize(pct_fraud_transactions = n() / num_fraud * 100) %>%
ggplot(aes(x = hour, y = pct_fraud_transactions)) +
geom_bar(stat = "identity", position = "dodge") +
scale_x_continuous(breaks = seq(0, 23, by = 1)) +
labs(title = "A lot of the fraudulent transactions occur just before midnight\nor in the late hours of the night",
x = "Hour",
y = "Percentage of Fradulent Transactions",
subtitle = "11pm to midnight is the most common time for fraudulent transactions to occur") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14),
axis.text.x = element_text(hjust = 0.5, vjust = 0.85))card_fraud_date %>%
filter(is_fraud == 1 & !is.na(weekday)) %>%
mutate(weekday = fct_reorder(weekday, weekday_num, max)) %>%
group_by(weekday, weekday_num) %>%
summarize(pct_fraud_transactions = n() / num_fraud * 100) %>%
ggplot(aes(x = weekday, y = pct_fraud_transactions)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "The weekends and monday are the most likely days for fraud",
x = "",
y = "Percentage of Fradulent Transactions",
subtitle = "Wednesday is the least likely day") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.85))lubridate package and the following code mutate(
age = interval(dob, trans_date_trans_time) / years(1),
)
card_fraud_dob <- card_fraud %>%
mutate(
age = interval(dob, trans_date_trans_time) / years(1),)
card_fraud_dob %>%
filter(is_fraud == 1) %>%
arrange(age) %>%
ggplot(aes(x = age)) +
geom_histogram(alpha=0.6, bins=60) +
labs(title = "People in their 20s and Late 40s-60s\nare most likely to be victims of fraud",
x = "Age",
y = "Number of Fradulent Transactions",
subtitle = "People in their 30s tend to be most shrewd") +
ggthemes::theme_economist() +
theme(plot.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.85)) +
scale_x_continuous(breaks = seq(round(min(card_fraud_dob$age), digits=0), round(max(card_fraud_dob$age), digits=0), by = 3))# distance between card holder's home and transaction
# code adapted from https://www.geeksforgeeks.org/program-distance-two-points-earth/amp/
card_fraud <- card_fraud %>%
mutate(
# convert latitude/longitude to radians
lat1_radians = lat / 57.29577951,
lat2_radians = merch_lat / 57.29577951,
long1_radians = long / 57.29577951,
long2_radians = merch_long / 57.29577951,
# calculate distance in miles
distance_miles = 3963.0 * acos((sin(lat1_radians) * sin(lat2_radians)) + cos(lat1_radians) * cos(lat2_radians) * cos(long2_radians - long1_radians)),
# calculate distance in km
distance_km = 6377.830272 * acos((sin(lat1_radians) * sin(lat2_radians)) + cos(lat1_radians) * cos(lat2_radians) * cos(long2_radians - long1_radians))
)
card_fraud %>%
mutate(is_fraud = ifelse(is_fraud == 1, "Yes", "No")) %>%
ggplot(aes(x = is_fraud, y = distance_miles)) +
geom_boxplot() +
labs(x = "Is Fraud", y = "Distance (Miles)") +
labs(title = "There doesnt see to be much distinction between\ntransaction distances between fraud and non fraudulent transaction",
x = "Age",
y = "Number of Fradulent Transactions",
subtitle = "This means that most fradulent transactions\nare made the same distanes away from the cardholders home")Plot a boxplot or a violin plot that looks at the relationship of distance and is_fraud. Does distance seem to be a useful feature in explaining fraud?
# Download electricity data
url <- "https://nyc3.digitaloceanspaces.com/owid-public/data/energy/owid-energy-data.csv"
energy <- read_csv(url) %>%
filter(year >= 1990) %>%
drop_na(iso_code) %>%
select(1:3,
biofuel = biofuel_electricity,
coal = coal_electricity,
gas = gas_electricity,
hydro = hydro_electricity,
nuclear = nuclear_electricity,
oil = oil_electricity,
other_renewable = other_renewable_exc_biofuel_electricity,
solar = solar_electricity,
wind = wind_electricity,
electricity_demand,
electricity_generation,
net_elec_imports, # Net electricity imports, measured in terawatt-hours
energy_per_capita, # Primary energy consumption per capita, measured in kilowatt-hours Calculated by Our World in Data based on BP Statistical Review of World Energy and EIA International Energy Data
energy_per_gdp, # Energy consumption per unit of GDP. This is measured in kilowatt-hours per 2011 international-$.
per_capita_electricity, # Electricity generation per capita, measured in kilowatt-hours
)
# Download data for C02 emissions per capita https://data.worldbank.org/indicator/EN.ATM.CO2E.PC
co2_percap <- wb_data(country = "countries_only",
indicator = "EN.ATM.CO2E.PC",
start_date = 1990,
end_date = 2022,
return_wide=FALSE) %>%
filter(!is.na(value)) %>%
#drop unwanted variables
select(-c(unit, obs_status, footnote, last_updated)) %>%
rename(year = date,
co2percap = value)
# Download data for GDP per capita https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.KD
gdp_percap <- wb_data(country = "countries_only",
indicator = "NY.GDP.PCAP.PP.KD",
start_date = 1990,
end_date = 2022,
return_wide=FALSE) %>%
filter(!is.na(value)) %>%
#drop unwanted variables
select(-c(unit, obs_status, footnote, last_updated)) %>%
rename(year = date,
GDPpercap = value)There are many sources of data on how countries generate their electricity and their CO2 emissions. I would like you to create three graphs:
energy %>%
filter(country == 'Netherlands' & year >= 2000) %>%
pivot_longer(cols = biofuel:wind, names_to = "fuel", values_to = "twh_generation") %>%
select(c('country', 'year','fuel', 'twh_generation')) %>%
mutate(twh_generation = ifelse(is.na(twh_generation), 0.0, twh_generation)) %>%
ggplot(aes(x = year, y = twh_generation, fill = fuel)) +
geom_area(colour="grey90", alpha = 0.6, position = "fill") +
labs(x = "Year", y = "Percentage Generation", fill = "Fuel",
title='Wind generation in Netherlands has accelerated since 2015',
subtitle='However, gas is still the main source of electricity generation') +
scale_fill_discrete(guide = guide_legend(reverse = TRUE)) +
scale_y_continuous(labels = scales::percent)